import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import plotly.express as px
%matplotlib inline
data = pd.read_excel("C:\\Users\\kottu\\OneDrive\\Documents\\Shopify\\2019 Winter Data Science Intern Challenge Data Set.xlsx")
data.head()
| order_id | shop_id | user_id | order_amount | total_items | payment_method | created_at | |
|---|---|---|---|---|---|---|---|
| 0 | 16 | 42 | 607 | 704000 | 2000 | credit_card | 2017-03-07 04:00:00 |
| 1 | 61 | 42 | 607 | 704000 | 2000 | credit_card | 2017-03-04 04:00:00 |
| 2 | 521 | 42 | 607 | 704000 | 2000 | credit_card | 2017-03-02 04:00:00 |
| 3 | 1105 | 42 | 607 | 704000 | 2000 | credit_card | 2017-03-24 04:00:00 |
| 4 | 1363 | 42 | 607 | 704000 | 2000 | credit_card | 2017-03-15 04:00:00 |
data.describe()
| order_id | shop_id | user_id | order_amount | total_items | |
|---|---|---|---|---|---|
| count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.00000 |
| mean | 2500.500000 | 50.078800 | 849.092400 | 3145.128000 | 8.78720 |
| std | 1443.520003 | 29.006118 | 87.798982 | 41282.539349 | 116.32032 |
| min | 1.000000 | 1.000000 | 607.000000 | 90.000000 | 1.00000 |
| 25% | 1250.750000 | 24.000000 | 775.000000 | 163.000000 | 1.00000 |
| 50% | 2500.500000 | 50.000000 | 849.000000 | 284.000000 | 2.00000 |
| 75% | 3750.250000 | 75.000000 | 925.000000 | 390.000000 | 3.00000 |
| max | 5000.000000 | 100.000000 | 999.000000 | 704000.000000 | 2000.00000 |
data['order_amount'].mean()
3145.128
(data.order_amount/data.total_items).mean()
387.7428
data["aov"] = data.order_amount/data.total_items
data.sort_values(by = ['aov'], ascending=False)
| order_id | shop_id | user_id | order_amount | total_items | payment_method | created_at | aov | |
|---|---|---|---|---|---|---|---|---|
| 2234 | 2496 | 78 | 707 | 51450 | 2 | cash | 2017-03-26 04:38:52.497 | 25725.0 |
| 2216 | 2453 | 78 | 709 | 51450 | 2 | cash | 2017-03-27 11:04:04.363 | 25725.0 |
| 2241 | 2513 | 78 | 935 | 51450 | 2 | debit | 2017-03-18 18:57:13.421 | 25725.0 |
| 4532 | 3781 | 78 | 889 | 25725 | 1 | cash | 2017-03-11 21:14:49.542 | 25725.0 |
| 2361 | 2819 | 78 | 869 | 51450 | 2 | debit | 2017-03-17 06:25:50.921 | 25725.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 118 | 246 | 92 | 890 | 360 | 4 | cash | 2017-03-18 17:42:28.606 | 90.0 |
| 3849 | 1844 | 92 | 987 | 90 | 1 | debit | 2017-03-06 07:01:09.439 | 90.0 |
| 1817 | 1280 | 92 | 871 | 180 | 2 | cash | 2017-03-26 16:56:18.843 | 90.0 |
| 771 | 1928 | 92 | 960 | 270 | 3 | debit | 2017-03-21 03:44:07.817 | 90.0 |
| 4701 | 4220 | 92 | 747 | 90 | 1 | credit_card | 2017-03-25 20:16:57.851 | 90.0 |
5000 rows × 8 columns
fig = px.box(data, y="aov")
fig.show()
data_filtered = data[data["shop_id"]!=78]
data_filtered.describe()
| order_id | shop_id | user_id | order_amount | total_items | aov | |
|---|---|---|---|---|---|---|
| count | 4954.000000 | 4954.000000 | 4954.000000 | 4954.000000 | 4954.000000 | 4954.000000 |
| mean | 2498.990916 | 49.819540 | 848.919257 | 2717.367784 | 8.851029 | 152.475575 |
| std | 1444.498907 | 29.014845 | 87.846007 | 41155.996469 | 116.857286 | 31.260218 |
| min | 1.000000 | 1.000000 | 607.000000 | 90.000000 | 1.000000 | 90.000000 |
| 25% | 1248.250000 | 24.000000 | 775.000000 | 163.000000 | 1.000000 | 132.000000 |
| 50% | 2494.500000 | 50.000000 | 849.000000 | 284.000000 | 2.000000 | 153.000000 |
| 75% | 3750.750000 | 74.000000 | 925.000000 | 390.000000 | 3.000000 | 168.000000 |
| max | 5000.000000 | 100.000000 | 999.000000 | 704000.000000 | 2000.000000 | 352.000000 |
(data_filtered.order_amount/data_filtered.total_items).mean()
152.47557529269278
data_filtered.describe()
| order_id | shop_id | user_id | order_amount | total_items | aov | |
|---|---|---|---|---|---|---|
| count | 4954.000000 | 4954.000000 | 4954.000000 | 4954.000000 | 4954.000000 | 4954.000000 |
| mean | 2498.990916 | 49.819540 | 848.919257 | 2717.367784 | 8.851029 | 152.475575 |
| std | 1444.498907 | 29.014845 | 87.846007 | 41155.996469 | 116.857286 | 31.260218 |
| min | 1.000000 | 1.000000 | 607.000000 | 90.000000 | 1.000000 | 90.000000 |
| 25% | 1248.250000 | 24.000000 | 775.000000 | 163.000000 | 1.000000 | 132.000000 |
| 50% | 2494.500000 | 50.000000 | 849.000000 | 284.000000 | 2.000000 | 153.000000 |
| 75% | 3750.750000 | 74.000000 | 925.000000 | 390.000000 | 3.000000 | 168.000000 |
| max | 5000.000000 | 100.000000 | 999.000000 | 704000.000000 | 2000.000000 | 352.000000 |
fig = px.box(data_filtered, y="aov")
fig.show()
print("Mean: ",data_filtered.aov.mean())
print("Median: ",data_filtered.aov.median())
print("Mode: ",data_filtered.aov.mode()[0])
Mean: 152.47557529269278 Median: 153.0 Mode: 153.0
print("Mean: ",data.aov.mean())
print("Median: ",data.aov.median())
print("Mode: ",data.aov.mode()[0])
Mean: 387.7428 Median: 153.0 Mode: 153.0
By following the best practices and taking no findings into consideration, we aren't cleaning the data. Here, metric to be used can be Mode or Median, which looks like a more affordable value than the mean of aov